### Summary ####
# Input: cnt20 and fmli193 - fmli212
# Output: Output df_all_cu (all households, for Section III analysis of the paper) 
#         and df_f (final sample, for the main analysis)
#         and cnt20_cleaned, for summary statistics

# Outline: 
# 1. Merge EIP information into the fmli files 
# 2. Create the basic panel, constructing variables used in the regression analysis
# 3. Update the panel by computing first differences 
# 4. Compute weights, income, and liquidity
# 5. Compute average expenditure at the CU level
# 6. Incorporate variables created in 4 and 5 into the panel, and re-arrange 
# 7. Create "All household sample" and "final sample" 

### 0 Preparations ####
setwd(getwd())

# Open libraries 
library(readxl) # for importing raw data
library(dplyr) # for data processing
library(quantreg) # for quantile regressions used in data cleaning

# Import datasets
cnt20 <- read_excel("Raw data/cnt20.xlsx")
fmli193 <- read_excel("Raw data/fmli193.xlsx")
fmli194 <- read_excel("Raw data/fmli194.xlsx")
fmli201 <- read_excel("Raw data/fmli201.xlsx")
fmli202 <- read_excel("Raw data/fmli202.xlsx")
fmli203 <- read_excel("Raw data/fmli203.xlsx")
fmli204 <- read_excel("Raw data/fmli204.xlsx")
fmli211 <- read_excel("Raw data/fmli211.xlsx")
fmli212 <- read_excel("Raw data/fmli212.xlsx") # only for liquidity

# Create copies of raw files 
fmli193_copy <- fmli193
fmli194_copy <- fmli194
fmli201_copy <- fmli201
fmli202_copy <- fmli202
fmli203_copy <- fmli203
fmli204_copy <- fmli204
fmli211_copy <- fmli211
fmli212_copy <- fmli212


# Change QINTRVMO (interview month) to YYMM

fmli193 <- fmli193 %>% mutate(
  YYMM = ifelse(QINTRVMO==7,1907,
                ifelse(QINTRVMO==8,1908,1909))) %>%
  select(-QINTRVMO)

fmli194 <- fmli194 %>% mutate(
  YYMM = ifelse(QINTRVMO==10,1910,
                ifelse(QINTRVMO==11,1911,1912))) %>%
  select(-QINTRVMO)

fmli201 <- fmli201 %>% mutate(
  YYMM = ifelse(QINTRVMO==1,2001,
                ifelse(QINTRVMO==2,2002,2003))) %>%
  select(-QINTRVMO)

fmli202 <- fmli202 %>% mutate(
  YYMM = ifelse(QINTRVMO==4,2004,
                ifelse(QINTRVMO==5,2005,2006))) %>%
  select(-QINTRVMO)

fmli203 <- fmli203 %>% mutate(
  YYMM = ifelse(QINTRVMO==7,2007,
                ifelse(QINTRVMO==8,2008,2009))) %>%
  select(-QINTRVMO)

fmli204 <- fmli204 %>% mutate(
  YYMM = ifelse(QINTRVMO==10,2010,
                ifelse(QINTRVMO==11,2011,2012))) %>%
  select(-QINTRVMO)

fmli211 <- fmli211 %>% mutate(
  YYMM = ifelse(QINTRVMO==1,2101,
                ifelse(QINTRVMO==2,2102,2103))) %>%
  select(-QINTRVMO)

# Change CONTMO (receipt month) to RYYMM
cnt20 <- cnt20 %>% filter(CONTCODE==800) %>%
  mutate(RYYMM = ifelse(CONTMO==4,2004,
                        ifelse(CONTMO==5,2005,
                               ifelse(CONTMO==6,2006,
                                      ifelse(CONTMO==7,2007,
                                             ifelse(CONTMO==8,2008,
                                                    ifelse(CONTMO==9,2009,
                                                           ifelse(CONTMO==10,2010,
                                                                  ifelse(CONTMO==11,2011,
                                                                         ifelse(CONTMO==12,2012,
                                                                                ifelse(CONTMO==1,2101,2102))))))))))) %>% 
  select(-CONTMO)

### 1 Clean cnt20 and combine with fmli's ####

#### 1.0 Modify EIP II reported in November ####
# Get all rebates reported in November 
cnt_nov <- cnt20 %>% filter(RYYMM==2011 & CONTCODE==800) %>% 
  mutate(ID = substr(as.character(NEWID),1,6))

# Use family files to get family size 
fmli204n211 <- bind_rows(fmli204,fmli211)

fmli204n211 <- fmli204n211 %>%
  select(NEWID,YYMM,FAM_SIZE)

cnt_nov <- merge(cnt_nov,fmli204n211,by="NEWID")

# Adding up rebates reported in a same interview by a same CU

# One may contend that this step will complicate the cases where 
# A CU reports both EIP I and EIP II in Nov
# BUT these cases are quite rare. Can check by:
# cnt_nov_1 <- cnt_nov %>% group_by(NEWID) %>% filter( n() == 1)
# cnt_nov_2 <- cnt_nov %>% group_by(NEWID) %>% filter( n() == 2)

cnt_nov <- cnt_nov %>% group_by(NEWID) %>%
  mutate(EIP=sum(CONTEXPX)) %>% 
  distinct(NEWID,.keep_all = TRUE)

# Change a Nov rebate to a Dec rebate if it is no greater than family size times 600
# Of course, one might be concerned about small amounts due to phase-out
# But there is only one nov rebate that is less than 600
# And we maintain that CUs who are subject to phase-out, meaning CUs with high income --
# are unlikely to receive EIP I this late
cnt_nov_dec <- cnt_nov %>% filter(EIP <= FAM_SIZE * 600)

# Keep Nov rebates in Nov if payment is greater than family size times 600
cnt_nov_nov <- cnt_nov %>% filter(EIP > FAM_SIZE * 600)

# List of NEWIDs that requires NEWID adjustment
# Note that for these NEWIDs 1) interviewed in Dec and 2) with Nov rebates moved to Dec
# The rebate is in fact in the reference period of the next interview
# Hence, in the next step, for these rebates, we add the corresponding NEWID by 1 
cnt_nov_dec_add <- cnt_nov_dec %>% 
  filter(YYMM==2012)

# Adjust the receipt month and NEWID accordingly 
cnt20 <- cnt20 %>% 
  mutate(RYYMM = ifelse((NEWID %in% cnt_nov_nov$NEWID) & RYYMM == 2011, 2011, 
                        ifelse((NEWID %in% cnt_nov_dec$NEWID) & RYYMM == 2011, 2012, RYYMM)),
         NEWID = ifelse(NEWID %in% cnt_nov_dec_add$NEWID & RYYMM==2012, as.character(as.numeric(NEWID)+1), NEWID))


# After this step, only Jan interview in fmli211 are needed.
# Let's keep only these 
fmli211 <- fmli211 %>% filter(YYMM==2101)
fmli211_copy <- fmli211_copy %>% filter(QINTRVMO==1)

#### 1.1 Find all rebates ####
# Those who do not receive EIP has variable EIP = 0
cnt20 <- cnt20 %>% 
  mutate(EIP = CONTEXPX)

# Get all rebates
# Dropping RYYMM 2012, 2101, 2102 to avoid counting EIP II as EIP I
cnt20_rc <- cnt20 %>% filter(EIP != 0 & RYYMM!=2012 & RYYMM!=2101 & RYYMM!=2102) %>% 
  select(NEWID,RYYMM,CHCKEFT,REBTUSED,EIP) %>%
  rename(EIPI = EIP)

# NEWID with different number of rebates 
# rb1 <- cnt20_rc %>% group_by(NEWID) %>% filter( n() == 1)
# rb2 <- cnt20_rc %>% group_by(NEWID) %>% filter( n() == 2)
# rb3 <- cnt20_rc %>% group_by(NEWID) %>% filter( n() == 3)
# rb4 <- cnt20_rc %>% group_by(NEWID) %>% filter( n() == 4)
# rb5 <- cnt20_rc %>% group_by(NEWID) %>% filter( n() == 5)
# rb6 <- cnt20_rc %>% group_by(NEWID) %>% filter( n() == 6)
# rb7 <- cnt20_rc %>% group_by(NEWID) %>% filter( n() == 7)
# rb8 <- cnt20_rc %>% group_by(NEWID) %>% filter( n() == 8)

#### 1.2 Modify missing values for disbursement method and usage ####
# Keep missing disbursement method as missing
# Do nothing here 

# Adjust so that missing usage is the same as the reported usage for the other 
# EIPI received in the same reference period, if the CU has more than one EIPIs
# (0nly consider the 2 EIPIs case below,
# since no CU receive more than two EIPIs in a reference period has this issue)
# One can also achieve this by merge, but loop is more straightforward and easy to run here
for (i in 1:(nrow(cnt20_rc)-1)){
  if(cnt20_rc$NEWID[i]==cnt20_rc$NEWID[i+1] & is.na(cnt20_rc$REBTUSED[i]) & !is.na(cnt20_rc$REBTUSED[i+1])){
    cnt20_rc$REBTUSED[i] = cnt20_rc$REBTUSED[i+1]
  }
  else if(cnt20_rc$NEWID[i]==cnt20_rc$NEWID[i+1] & is.na(cnt20_rc$REBTUSED[i+1]) & !is.na(cnt20_rc$REBTUSED[i])){
    cnt20_rc$REBTUSED[i+1] = cnt20_rc$REBTUSED[i]
  }
}

write.csv(cnt20_rc,"cnt20_cleaned.csv")

#### 1.3 Create rebate variables at the interview level ####

# Create EIPI by type
cnt20_rc <- cnt20_rc %>% 
  mutate(EIPI_ck = ifelse(CHCKEFT==1, EIPI, 0), # by check
         EIPI_dd = ifelse(CHCKEFT==2, EIPI, 0), # by direct deposit
         EIPI_dc = ifelse(CHCKEFT==3, EIPI, 0), # by debit card
         EIPI_ep = ifelse(REBTUSED==1, EIPI, 0), # for expenses 
         EIPI_debt = ifelse(REBTUSED==2,EIPI, 0), # for debt
         EIPI_sv = ifelse(REBTUSED==3, EIPI, 0), # for savings 
         
         # By receipient month
         EIPI_Apr = ifelse(RYYMM==2004, EIPI, 0),
         EIPI_May = ifelse(RYYMM==2005, EIPI, 0),
         EIPI_Jun = ifelse(RYYMM==2006, EIPI, 0),
         EIPI_Jul = ifelse(RYYMM==2007, EIPI, 0),
         EIPI_Aug = ifelse(RYYMM==2008, EIPI, 0),
         EIPI_Sep = ifelse(RYYMM==2009, EIPI, 0),
         EIPI_Oct = ifelse(RYYMM==2010, EIPI, 0),
         EIPI_Nov = ifelse(RYYMM==2011, EIPI, 0))

# Merge all EIPIs received by a household in the same reference period, i.e. group by NEWID 
cnt20_rc <- cnt20_rc %>%
  group_by(NEWID) %>%
  mutate(EIPI_t=sum(EIPI),EIPI_by_ck_t=sum(EIPI_ck),EIPI_by_dd_t=sum(EIPI_dd),EIPI_by_dc_t=sum(EIPI_dc),
         EIPI_for_ep_t=sum(EIPI_ep),EIPI_for_debt_t=sum(EIPI_debt),EIPI_for_sv_t=sum(EIPI_sv),
         EIPI_apr_t=sum(EIPI_Apr),EIPI_may_t=sum(EIPI_May),EIPI_jun_t=sum(EIPI_Jun),EIPI_jul_t=sum(EIPI_Jul),
         EIPI_aug_t=sum(EIPI_Aug),EIPI_sep_t=sum(EIPI_Sep),EIPI_oct_t=sum(EIPI_Oct),EIPI_nov_t=sum(EIPI_Nov),
         EIPI_t_count = n()) %>%
  ungroup()

# Create indicator variables
cnt20_rc <- cnt20_rc %>% 
  mutate(iEIPI_t = ifelse(EIPI_t>0,1,0),
         iEIPI_by_ck_t = ifelse(EIPI_by_ck_t>0,1,0),
         iEIPI_by_dd_t = ifelse(EIPI_by_dd_t>0,1,0),
         iEIPI_by_dc_t = ifelse(EIPI_by_dc_t>0,1,0),
         iEIPI_for_ep_t = ifelse(EIPI_for_ep_t>0,1,0),
         iEIPI_for_debt_t = ifelse(EIPI_for_debt_t>0,1,0),
         iEIPI_for_sv_t = ifelse(EIPI_for_sv_t>0,1,0))

# Drop pre-aggregated EIP variables
# Drop repetitions created by group_by() and mutate()
# cnt20_rc now contains all EIPI information aggregated to the interview level
cnt20_rc <- cnt20_rc %>% select(-c(2:19)) %>% 
  distinct(NEWID,.keep_all = TRUE)

#### 1.4 Identify different types of CUs  ####

# Create CU ID by dropping the last digit of NEWID (interview number)
lists_base <- cnt20_rc 
lists_base$ID <- as.character(lists_base$NEWID)
lists_base$ID <- substr(lists_base$NEWID,1,nchar(lists_base$NEWID)-1)

# Aggregate all EIPIs received by a CU
lists_base <- lists_base %>%
  group_by(ID) %>%
  mutate(TEIPI=sum(EIPI_t),TEIPI_by_ck=sum(EIPI_by_ck_t),TEIPI_by_dd=sum(EIPI_by_dd_t),TEIPI_by_dc=sum(EIPI_by_dc_t),
         TEIPI_for_ep=sum(EIPI_for_ep_t),TEIPI_for_debt=sum(EIPI_for_debt_t),TEIPI_for_sv=sum(EIPI_for_sv_t)) %>% 
  ungroup() %>% 
  distinct(ID,.keep_all = TRUE)

# CUs that are recipients 
list_r <- lists_base %>% filter(TEIPI >0) %>%
  select("ID") 
# CUs that receive at least one EIPI by check 
list_ck <- lists_base %>% filter(TEIPI_by_ck >0) %>%
  select("ID")
# CUs that receive at least one EIPI by direct deposit
list_dd <- lists_base %>% filter(TEIPI_by_dd >0) %>%
  select("ID")
# CUs that receive at least one EIPI by debit card
list_dc <- lists_base %>% filter(TEIPI_by_dc >0) %>%
  select("ID")
# CUs that use at least one EIPI "mostly for expense"
list_ep <- lists_base %>% filter(TEIPI_for_ep >0) %>%
  select("ID")
# CUs that use at least one EIPI "mostly to pay off debt"
list_debt <- lists_base %>% filter(TEIPI_for_debt >0) %>%
  select("ID")
# CUs that use at least one EIPI "mostly to increase savings"
list_sv <- lists_base %>% filter(TEIPI_for_sv >0) %>%
  select("ID")

#### 1.5 Find all interview without rebates ####
fmli <- bind_rows(fmli193,fmli194,fmli201,fmli202,
                  fmli203,fmli204,fmli211)

fmli_flt <- fmli %>% 
  select (NEWID) %>% 
  mutate(
    # Not exactly true for May 2020 interviews
    # but are taken care of later in the function df_modifier()
    EIPI_t = 0,
    EIPI_by_ck_t = 0,
    EIPI_by_dd_t = 0,
    EIPI_by_dc_t = 0,
    EIPI_for_ep_t = 0,
    EIPI_for_debt_t = 0,
    EIPI_for_sv_t = 0,
    
    iEIPI_t = 0,
    iEIPI_by_ck_t = 0,
    iEIPI_by_dd_t = 0,
    iEIPI_by_dc_t = 0,
    iEIPI_for_ep_t = 0,
    iEIPI_for_debt_t = 0,
    iEIPI_for_sv_t = 0,
    
    EIPI_apr_t = 0,
    EIPI_may_t = 0,
    EIPI_jun_t = 0,
    EIPI_jul_t = 0,
    EIPI_aug_t = 0,
    EIPI_sep_t = 0,
    EIPI_oct_t = 0,
    EIPI_nov_t = 0,
    
    EIPI_t_count = 0
    
    # Note: not exactly true for May 2020 interviews
    # but May interviews will not be included anyways
    
  )

# cnt20_nr contains all interviews without rebates reported 
cnt20_nr <- fmli_flt %>% filter(!(NEWID %in% cnt20_rc$NEWID))

#### 1.6 Create basic df with all rebate information ####
# Merge to form a cnt20_f that contains all information about rebates
cnt20_f <- rbind(cnt20_rc,cnt20_nr)
# Merge df with fmli (fmli193 to fmli204)
# df now contains all EIPI information as well as other info already in fmli
df <- merge(fmli,cnt20_f,by="NEWID")

### 2  Panel without first difference ####
# df_modifier keeps only CU with an interview in June or July 2020 in the sample
# It then select CE variables in df that are relevant to the study 
# Finally, it constructs variables used in our study
# including demographics, expenditure, lagged EIPIs, and group dummies

df_modifier <- function(df){
  
  #### Create CU ID #### 
  # Drop the last digit of NEWID
  df <- df %>% mutate(
    ID = substr(as.character(NEWID),1,6))
  
  #### Selection of CUs ####
  # Keep only CUs that meet the basic criterion to stay in the sample
  jun_list <- df %>% filter(YYMM==2006) %>% select(ID)
  jul_list <- df %>% filter(YYMM==2007) %>% select(ID)
  
  df <- df %>%
    filter(ID %in% jun_list$ID | ID %in% jul_list$ID) 
  
  #### Selecting relevant CE variables ####
  df <- df %>% select(
    # interview info
    ID, NEWID, YYMM, INTERI,
    
    # Demographics
    PERSLT18, FAM_SIZE, AGE_REF, AGE2, SEX_REF,MARITAL1,CUTENURE,
    
    # EIPIs
    EIPI_t, EIPI_by_ck_t, EIPI_by_dd_t, EIPI_by_dc_t,
    EIPI_for_ep_t, EIPI_for_debt_t, EIPI_for_sv_t, iEIPI_t, iEIPI_by_ck_t, iEIPI_by_dd_t, iEIPI_by_dc_t,
    iEIPI_for_ep_t, iEIPI_for_debt_t, iEIPI_for_sv_t, 
    EIPI_apr_t,EIPI_may_t,EIPI_jun_t,EIPI_jul_t,EIPI_aug_t,EIPI_sep_t,EIPI_oct_t,EIPI_nov_t,EIPI_t_count,
    
    # Food expenditure
    FDAWAYCQ, FDAWAYPQ, FDHOMECQ,FDHOMEPQ,
    FOODCQ, FOODPQ, ALCBEVCQ, ALCBEVPQ, 
    
    # Strict non-durable expenditure
    UTILCQ, UTILPQ, HOUSOPCQ, HOUSOPPQ,
    PUBTRACQ, PUBTRAPQ, GASMOCQ, GASMOPQ,
    PERSCACQ, PERSCAPQ, TOBACCCQ, TOBACCPQ,
    MISCCQ, MISCPQ, 
    
    # Non-durables expenditure
    APPARCQ, APPARPQ,
    HEALTHCQ, HEALTHPQ, READCQ, READPQ,
    
    # Total expenditure
    TOTEXPCQ, TOTEXPPQ, 
    HOUSCQ, HOUSPQ, EDUCACQ, EDUCAPQ,
    ENTERTCQ, ENTERTPQ, TRANSCQ, TRANSPQ,
    CASHCOCQ, CASHCOPQ) %>% 
    
    #### Create demographic and expenditure variables studied ####
  mutate(
    
    # Demographics
    NUM_KIDS = PERSLT18,
    NUM_ADTS = FAM_SIZE - PERSLT18,
    AGE_AVG = ifelse(is.na(AGE2),AGE_REF,(AGE_REF + AGE2)/2),
    
    # Four big expenditure categories
    EX_FD = FOODCQ + FOODPQ + ALCBEVCQ + ALCBEVPQ,
    
    EX_SN = FOODCQ + FOODPQ + ALCBEVCQ + ALCBEVPQ +
      UTILCQ + UTILPQ + HOUSOPCQ + HOUSOPPQ +
      PUBTRACQ + PUBTRAPQ + GASMOCQ + GASMOPQ +
      PERSCACQ + PERSCAPQ + TOBACCCQ + TOBACCPQ +
      MISCCQ + MISCPQ,
    
    EX_N = EX_SN +
      APPARCQ + APPARPQ + HEALTHCQ + HEALTHPQ +
      READCQ + READPQ,
    
    EX_T = TOTEXPCQ + TOTEXPPQ,
    
    # Sub-categories - food
    EX_FD_HM = FDHOMECQ + FDHOMEPQ, # food at home
    EX_FD_AW = FDAWAYCQ + FDAWAYPQ, # food away from home 
    EX_ALC = ALCBEVCQ + ALCBEVPQ, # alcholic beverages 
    
    # Sub-categories - Strict non-durables
    EX_UT_HO = UTILCQ + UTILPQ + HOUSOPCQ + HOUSOPPQ, # utility and household operations 
    EX_PC_MIS = PERSCACQ + PERSCAPQ + MISCCQ + MISCPQ, # personal care and miscellaneous 
    EX_TR_GAS = PUBTRACQ + PUBTRAPQ + GASMOCQ + GASMOPQ, # public transportation, gas, and motor oil
    EX_TBC = TOBACCCQ + TOBACCPQ, # tobacco
    
    # Sub-categories - Non-durables 
    EX_APR = APPARCQ + APPARPQ, # apparel 
    EX_HLT = HEALTHCQ + HEALTHPQ, # health
    EX_READ = READCQ + READPQ, # reading materials
    
    # Sub-categories - Total
    EX_HS = HOUSCQ + HOUSPQ, # housing 
    EX_EDU = EDUCACQ + EDUCAPQ, # education 
    EX_ENT = ENTERTCQ + ENTERTPQ, # entertainment 
    EX_TRANS = TRANSCQ + TRANSPQ, # transportation
    EX_CACT = CASHCOCQ + CASHCOPQ # cash contributions
  ) %>% 
    
  #### Create lagged EIPI variables ####
  # Default is set to zero since there is no EIPI disbursed before April 2020
  # So for all interviews before June (no May cycle CUs are included), lag is zero
  # For all interviews after June, lag will be decided based on the previous interview
  group_by(ID) %>%
    arrange(YYMM, .by_group = TRUE) %>%
    mutate(EIPI_tm1 = lag(EIPI_t, n=1, default=0),
           EIPI_by_ck_tm1 = lag(EIPI_by_ck_t, n=1, default=0),
           EIPI_by_dd_tm1 = lag(EIPI_by_dd_t, n=1, default=0),
           EIPI_by_dc_tm1 = lag(EIPI_by_dc_t, n=1, default=0),
           EIPI_for_ep_tm1 = lag(EIPI_for_ep_t, n=1, default=0),
           EIPI_for_debt_tm1 = lag(EIPI_for_debt_t, n=1, default=0),
           EIPI_for_sv_tm1 = lag(EIPI_for_sv_t, n=1, default=0),
           
           iEIPI_tm1 = lag(iEIPI_t, n=1, default=0),
           iEIPI_by_ck_tm1 = lag(iEIPI_by_ck_t, n=1, default=0),
           iEIPI_by_dd_tm1 = lag(iEIPI_by_dd_t, n=1, default=0),
           iEIPI_by_dc_tm1 = lag(iEIPI_by_dc_t, n=1, default=0),
           iEIPI_for_ep_tm1 = lag(iEIPI_for_ep_t, n=1, default=0),
           iEIPI_for_debt_tm1 = lag(iEIPI_for_debt_t, n=1, default=0),
           iEIPI_for_sv_tm1 = lag(iEIPI_for_sv_t, n=1, default=0),
           
           EIPI_apr_tm1 = lag(EIPI_apr_t, n=1, default=0),
           EIPI_may_tm1 = lag(EIPI_may_t, n=1, default=0),
           EIPI_jun_tm1 = lag(EIPI_jun_t, n=1, default=0),
           EIPI_jul_tm1 = lag(EIPI_jul_t, n=1, default=0),
           EIPI_aug_tm1 = lag(EIPI_aug_t, n=1, default=0),
           EIPI_sep_tm1 = lag(EIPI_sep_t, n=1, default=0),
           EIPI_oct_tm1 = lag(EIPI_oct_t, n=1, default=0),
           EIPI_nov_tm1 = lag(EIPI_nov_t, n=1, default=0),
           
           EIPI_tm2 = lag(EIPI_t, n=2, default=0),
           EIPI_by_ck_tm2 = lag(EIPI_by_ck_t, n=2, default=0),
           EIPI_by_dd_tm2 = lag(EIPI_by_dd_t, n=2, default=0),
           EIPI_by_dc_tm2 = lag(EIPI_by_dc_t, n=2, default=0),
           EIPI_for_ep_tm2 = lag(EIPI_for_ep_t, n=2, default=0),
           EIPI_for_debt_tm2 = lag(EIPI_for_debt_t, n=2, default=0),
           EIPI_for_sv_tm2 = lag(EIPI_for_sv_t, n=2, default=0),
           
           iEIPI_tm2 = lag(iEIPI_t, n=2, default=0),
           iEIPI_by_ck_tm2 = lag(iEIPI_by_ck_t, n=2, default=0),
           iEIPI_by_dd_tm2 = lag(iEIPI_by_dd_t, n=2, default=0),
           iEIPI_by_dc_tm2 = lag(iEIPI_by_dc_t, n=2, default=0),
           iEIPI_for_ep_tm2 = lag(iEIPI_for_ep_t, n=2, default=0),
           iEIPI_for_debt_tm2 = lag(iEIPI_for_debt_t, n=2, default=0),
           iEIPI_for_sv_tm2 = lag(iEIPI_for_sv_t, n=2, default=0),
           
           EIPI_apr_tm2 = lag(EIPI_apr_t, n=2, default=0),
           EIPI_may_tm2 = lag(EIPI_may_t, n=2, default=0),
           EIPI_jun_tm2 = lag(EIPI_jun_t, n=2, default=0),
           EIPI_jul_tm2 = lag(EIPI_jul_t, n=2, default=0),
           EIPI_aug_tm2 = lag(EIPI_aug_t, n=2, default=0),
           EIPI_sep_tm2 = lag(EIPI_sep_t, n=2, default=0),
           EIPI_oct_tm2 = lag(EIPI_oct_t, n=2, default=0),
           EIPI_nov_tm2 = lag(EIPI_nov_t, n=2, default=0)) %>%
    
    select(
      ID, NEWID, YYMM, INTERI,
      
      # Demographics
      NUM_KIDS,NUM_ADTS, FAM_SIZE, AGE_REF, AGE2, AGE_AVG, SEX_REF,MARITAL1,CUTENURE,
      
      # Spendings
      EX_FD,EX_SN,EX_N,EX_T,EX_FD_HM,EX_FD_AW,EX_ALC,
      EX_UT_HO,EX_PC_MIS,EX_TR_GAS,EX_TBC,EX_APR,EX_HLT,EX_READ,
      EX_HS,EX_EDU,EX_ENT,EX_TRANS,EX_CACT,
      
      # EIPIs
      EIPI_t, EIPI_by_ck_t, EIPI_by_dd_t, EIPI_by_dc_t,
      EIPI_for_ep_t, EIPI_for_debt_t, EIPI_for_sv_t, iEIPI_t, iEIPI_by_ck_t, iEIPI_by_dd_t , iEIPI_by_dc_t,
      iEIPI_for_ep_t, iEIPI_for_debt_t, iEIPI_for_sv_t, 
      EIPI_apr_t,EIPI_may_t,EIPI_jun_t,EIPI_jul_t,EIPI_aug_t,EIPI_sep_t,EIPI_oct_t,EIPI_nov_t, EIPI_t_count,
      
      EIPI_tm1, EIPI_by_ck_tm1, EIPI_by_dd_tm1, EIPI_by_dc_tm1,
      EIPI_for_ep_tm1, EIPI_for_debt_tm1, EIPI_for_sv_tm1, iEIPI_tm1, iEIPI_by_ck_tm1, iEIPI_by_dd_tm1, iEIPI_by_dc_tm1,
      iEIPI_for_ep_tm1, iEIPI_for_debt_tm1, iEIPI_for_sv_tm1, 
      EIPI_apr_tm1,EIPI_may_tm1,EIPI_jun_tm1,EIPI_jul_tm1,EIPI_aug_tm1,EIPI_sep_tm1,EIPI_oct_tm1,EIPI_nov_tm1,
      
      EIPI_tm2, EIPI_by_ck_tm2, EIPI_by_dd_tm2, EIPI_by_dc_tm2,
      EIPI_for_ep_tm2, EIPI_for_debt_tm2, EIPI_for_sv_tm2, iEIPI_tm2, iEIPI_by_ck_tm2, iEIPI_by_dd_tm2, iEIPI_by_dc_tm2,
      iEIPI_for_ep_tm2, iEIPI_for_debt_tm2, iEIPI_for_sv_tm2, 
      EIPI_apr_tm2,EIPI_may_tm2,EIPI_jun_tm2,EIPI_jul_tm2,EIPI_aug_tm2,EIPI_sep_tm2,EIPI_oct_tm2,EIPI_nov_tm2) 
  
  #### Create group dummies ####
  df <- df %>% mutate(
    r = ifelse(ID %in% list_r$ID, 1, 0), # dummy for recipient 
    ck = ifelse(ID %in% list_ck$ID, 1, 0), # dummy for receiving EIPII only by check 
    dd = ifelse(ID %in% list_dd$ID, 1, 0), # dummy for receiving EIPII only by direct deposit
    dc = ifelse(ID %in% list_dc$ID, 1, 0), # dummy for receiving EIPII only by debit card 
    
    # Assign NAs 
    ck = ifelse((r == 1 & !(ID %in% list_ck$ID) & !(ID %in% list_dd$ID) & !(ID %in% list_dc$ID)), NA, ck),
    dd = ifelse((r == 1 & !(ID %in% list_ck$ID) & !(ID %in% list_dd$ID) & !(ID %in% list_dc$ID)), NA, dd),
    dc = ifelse((r == 1 & !(ID %in% list_ck$ID) & !(ID %in% list_dd$ID) & !(ID %in% list_dc$ID)), NA, dc),
    
    # Combinations
    
    ck_dd = ifelse(ck==1 & dd==1,1,0), # dummy for receiving EIPII by both check and direct deposit 
    ck_dc = ifelse(ck==1 & dc==1,1,0), # dummy for receiving EIPII by both check and debit card 
    dd_dc = ifelse(dd==1 & dc==1,1,0), # dummy for receiving EIPII by both direct deposit and debit card
    ck_dd_dc = ifelse(ck==1 & dd==1 & dc==1,1,0),# dummy for receiving EIPII by all methods
    
    # Adjusting ck and dd to 0 if ck_dd is 1
    ck = ifelse(ck_dd == 1, 0, ck),
    dd = ifelse(ck_dd == 1, 0, dd),
    
    # similar to above
    ck = ifelse(ck_dc == 1, 0, ck),
    dc = ifelse(ck_dc == 1, 0, dc),
    
    dd = ifelse(dd_dc == 1, 0, dd),
    dc = ifelse(dd_dc == 1, 0, dc),
    
    ck = ifelse(ck_dd_dc == 1, 0, ck),
    dd = ifelse(ck_dd_dc == 1, 0, dd),
    dc = ifelse(ck_dd_dc == 1, 0, dc),
    
    ck_dd = ifelse(ck_dd_dc==1, 0, ck_dd),
    ck_dc = ifelse(ck_dd_dc==1, 0, ck_dc),
    dd_dc = ifelse(ck_dd_dc==1, 0, dd_dc),
    
    # usage
    ep = ifelse(ID %in% list_ep$ID, 1, 0), # dummy for only using EIPII "mostly for expenses" 
    debt = ifelse(ID %in% list_debt$ID, 1, 0), # dummy for using EIPII "mostly to pay off debt"
    sv = ifelse(ID %in% list_sv$ID, 1, 0), # dummy for using EIPII "mostly to add to savings"
    
    # Assign NAs 
    ep = ifelse((r == 1 & !(ID %in% list_ep$ID) & !(ID %in% list_debt$ID) & !(ID %in% list_sv$ID)), NA, ep),
    debt = ifelse((r == 1 & !(ID %in% list_ep$ID) & !(ID %in% list_debt$ID) & !(ID %in% list_sv$ID)), NA, debt),
    sv = ifelse((r == 1 & !(ID %in% list_ep$ID) & !(ID %in% list_debt$ID) & !(ID %in% list_sv$ID)), NA, sv),
    
    # Combinations
    ep_debt = ifelse(ep==1 & debt==1,1,0), # dummy for using EIPII "mostly " for both "expenses" and "debt"
    ep_sv = ifelse(ep==1 & sv==1,1,0), # dummy for using EIPII "mostly" for both "expenses" and "savings" 
    debt_sv = ifelse(debt==1 & sv==1,1,0), # dummy for using EIPII  "mostly" for both "debt" and "savings" 
    ep_debt_sv = ifelse(debt==1 & sv==1 & ep ==1,1,0), # dummy for using EIPII "mostly" for all three purposes
    
    # Adjusting ep and debt to 0 if ep_debt is 1
    ep = ifelse(ep_debt == 1, 0 , ep),
    debt = ifelse(ep_debt == 1, 0 , debt),
    # similar to above
    ep = ifelse(ep_sv == 1, 0 , ep),
    sv = ifelse(ep_sv == 1, 0 , sv),
    
    debt = ifelse(debt_sv == 1, 0 , debt),
    sv = ifelse(debt_sv == 1, 0 , sv),
    
    ep = ifelse(ep_debt_sv == 1, 0 , ep),
    debt = ifelse(ep_debt_sv == 1, 0 , debt),
    sv = ifelse(ep_debt_sv == 1, 0 , sv),
    
    ep_debt = ifelse(ep_debt_sv==1,0,ep_debt),
    ep_sv = ifelse(ep_debt_sv==1,0,ep_sv),
    debt_sv = ifelse(ep_debt_sv==1,0,debt_sv))
  
  return(df)
}

df <- df_modifier(df)

### 3 Panel with first difference ####

#### 3.1 Obtain and compute all variables of interest from fmli ####
# fmli_modifier is similar to df_modifier
# It selects all the relevant CE variables and construct variables used in the study
fmli_modifier <- function(fmli){
  fmli <- fmli %>% select(
    # interview info
    NEWID, YYMM, INTERI,
    # Demographics
    PERSLT18, FAM_SIZE, AGE_REF, AGE2, SEX_REF,MARITAL1,CUTENURE,
    
    # Food expenditure
    FDAWAYCQ, FDAWAYPQ, FDHOMECQ,FDHOMEPQ,
    FOODCQ, FOODPQ, ALCBEVCQ, ALCBEVPQ, 
    
    # Strict non-durables expenditure
    UTILCQ, UTILPQ, HOUSOPCQ, HOUSOPPQ,
    PUBTRACQ, PUBTRAPQ, GASMOCQ, GASMOPQ,
    PERSCACQ, PERSCAPQ, TOBACCCQ, TOBACCPQ,
    MISCCQ, MISCPQ, 
    
    # Non-durables expenditure
    APPARCQ, APPARPQ,
    HEALTHCQ, HEALTHPQ, READCQ, READPQ,
    TOTEXPCQ, TOTEXPPQ, 
    
    # Total expenditure
    HOUSCQ, HOUSPQ, EDUCACQ, EDUCAPQ,
    ENTERTCQ, ENTERTPQ, TRANSCQ, TRANSPQ,
    CASHCOCQ, CASHCOPQ) %>% 
    
    mutate(
      ID = substr(as.character(NEWID),1,6),
      
      # Demogpraphics
      NUM_KIDS = PERSLT18,
      NUM_ADTS = FAM_SIZE - PERSLT18,
      AGE_AVG = ifelse(is.na(AGE2),AGE_REF,(AGE_REF + AGE2)/2),
      
      # Four big categories
      EX_FD = FOODCQ + FOODPQ + ALCBEVCQ + ALCBEVPQ,
      
      EX_SN = FOODCQ + FOODPQ + ALCBEVCQ + ALCBEVPQ +
        UTILCQ + UTILPQ + HOUSOPCQ + HOUSOPPQ +
        PUBTRACQ + PUBTRAPQ + GASMOCQ + GASMOPQ +
        PERSCACQ + PERSCAPQ + TOBACCCQ + TOBACCPQ +
        MISCCQ + MISCPQ,
      
      EX_N = EX_SN +
        APPARCQ + APPARPQ + HEALTHCQ + HEALTHPQ +
        READCQ + READPQ,
      
      EX_T = TOTEXPCQ + TOTEXPPQ,
      
      # Sub-categories - food
      EX_FD_HM = FDHOMECQ + FDHOMEPQ,
      EX_FD_AW = FDAWAYCQ + FDAWAYPQ,
      EX_ALC = ALCBEVCQ + ALCBEVPQ,
      
      # Sub-categories - Strict non-durables
      EX_UT_HO = UTILCQ + UTILPQ + HOUSOPCQ + HOUSOPPQ,
      EX_PC_MIS = PERSCACQ + PERSCAPQ + MISCCQ + MISCPQ,
      EX_TR_GAS = PUBTRACQ + PUBTRAPQ + GASMOCQ + GASMOPQ,
      EX_TBC = TOBACCCQ + TOBACCPQ,
      
      # Sub-categories - Non-durables 
      EX_APR = APPARCQ + APPARPQ,
      EX_HLT = HEALTHCQ + HEALTHPQ,
      EX_READ = READCQ + READPQ,
      
      # Sub-categories - Total
      EX_HS = HOUSCQ + HOUSPQ,
      EX_EDU = EDUCACQ + EDUCAPQ,
      EX_ENT = ENTERTCQ + ENTERTPQ,
      EX_TRANS = TRANSCQ + TRANSPQ,
      EX_CACT = CASHCOCQ + CASHCOPQ
    ) %>% 
    
    select(
      ID,
      
      NUM_KIDS,NUM_ADTS, FAM_SIZE, AGE_REF, AGE2, AGE_AVG, SEX_REF, MARITAL1, CUTENURE,
      
      EX_FD,EX_SN,EX_N,EX_T,EX_FD_HM,EX_FD_AW,EX_ALC,
      EX_UT_HO,EX_PC_MIS,EX_TR_GAS,EX_TBC,EX_APR,EX_HLT,EX_READ,
      EX_HS,EX_EDU,EX_ENT,EX_TRANS,EX_CACT)
  
  return(fmli)
}

# Apply the function
fmli193 <- fmli_modifier(fmli193)
fmli194 <- fmli_modifier(fmli194)
fmli201 <- fmli_modifier(fmli201)
fmli202 <- fmli_modifier(fmli202)
fmli203 <- fmli_modifier(fmli203)
fmli204 <- fmli_modifier(fmli204)
fmli211 <- fmli_modifier(fmli211)

#### 3.2 Computing first difference ####
# diff_maker merge selected df interviews with the their previous interviews
# It renames the variables by time (t and tm1), and computes first difference

diff_maker <- function(df,fmli_tm1){
  # merge certain interviews with precious interviews 
  df <- merge(df,fmli_tm1,by="ID")
  # rename variables by time
  df <- df %>% rename(
    
    FAM_SIZE_t = FAM_SIZE.x, AGE_REF_t = AGE_REF.x, AGE2_t = AGE2.x, AGE_AVG_t = AGE_AVG.x,
    SEX_REF_t = SEX_REF.x, MARITAL1_t = MARITAL1.x, NUM_ADTS_t = NUM_ADTS.x,
    NUM_KIDS_t = NUM_KIDS.x, CUTENURE_t = CUTENURE.x,
    
    FAM_SIZE_tm1 = FAM_SIZE.y, AGE_REF_tm1 = AGE_REF.y, AGE2_tm1 = AGE2.y, AGE_AVG_tm1 = AGE_AVG.y,
    SEX_REF_tm1 = SEX_REF.y, MARITAL1_tm1 = MARITAL1.y, NUM_ADTS_tm1 = NUM_ADTS.y,
    NUM_KIDS_tm1 = NUM_KIDS.y, CUTENURE_tm1 = CUTENURE.y,
    
    EX_FD_t = EX_FD.x, EX_SN_t = EX_SN.x, EX_N_t = EX_N.x, EX_T_t = EX_T.x,
    EX_FD_HM_t = EX_FD_HM.x, EX_FD_AW_t = EX_FD_AW.x, EX_ALC_t = EX_ALC.x,
    EX_UT_HO_t = EX_UT_HO.x, EX_PC_MIS_t = EX_PC_MIS.x, EX_TR_GAS_t = EX_TR_GAS.x,
    EX_TBC_t = EX_TBC.x, EX_APR_t = EX_APR.x, EX_HLT_t = EX_HLT.x, EX_READ_t = EX_READ.x,
    EX_HS_t = EX_HS.x, EX_EDU_t = EX_EDU.x, EX_ENT_t = EX_ENT.x, EX_TRANS_t = EX_TRANS.x,
    EX_CACT_t = EX_CACT.x, 
    
    EX_FD_tm1 = EX_FD.y, EX_SN_tm1 = EX_SN.y, EX_N_tm1 = EX_N.y, EX_T_tm1 = EX_T.y,
    EX_FD_HM_tm1 = EX_FD_HM.y, EX_FD_AW_tm1 = EX_FD_AW.y, EX_ALC_tm1 = EX_ALC.y,
    EX_UT_HO_tm1 = EX_UT_HO.y, EX_PC_MIS_tm1 = EX_PC_MIS.y, EX_TR_GAS_tm1 = EX_TR_GAS.y,
    EX_TBC_tm1 = EX_TBC.y, EX_APR_tm1 = EX_APR.y, EX_HLT_tm1 = EX_HLT.y, EX_READ_tm1 = EX_READ.y,
    EX_HS_tm1 = EX_HS.y, EX_EDU_tm1 = EX_EDU.y, EX_ENT_tm1 = EX_ENT.y, EX_TRANS_tm1 = EX_TRANS.y,
    EX_CACT_tm1 = EX_CACT.y) %>% 
    
    # Computing first difference 
    mutate(
      # change in family size
      d_FAM_SIZE_t = FAM_SIZE_t - FAM_SIZE_tm1,
      d_NUM_ADTS_t = NUM_ADTS_t - NUM_ADTS_tm1,
      d_NUM_KIDS_t = NUM_KIDS_t - NUM_KIDS_tm1,
      
      # change in age
      d_AGE_REF_t = AGE_REF_t - AGE_REF_tm1,
      d_AGE_2 = AGE2_t - AGE2_tm1,
      
      # 4 main expenditure categories
      d_EX_FD_t = as.numeric(format(round(EX_FD_t - EX_FD_tm1,1),scientific=F)),
      d_EX_SN_t = as.numeric(format(round(EX_SN_t - EX_SN_tm1,1),scientific=F)),
      d_EX_N_t = as.numeric(format(round(EX_N_t - EX_N_tm1,1),scientific=F)),
      d_EX_T_t = as.numeric(format(round(EX_T_t - EX_T_tm1,1),scientific=F)),
      
      # Food sub-category 
      d_EX_FD_HM_t = as.numeric(format(round(EX_FD_HM_t - EX_FD_HM_tm1,1),scientific=F)),
      d_EX_FD_AW_t = as.numeric(format(round(EX_FD_AW_t - EX_FD_AW_tm1,1),scientific=F)),
      d_EX_ALC_t = as.numeric(format(round(EX_ALC_t - EX_ALC_tm1,1),scientific=F)),
      
      # Strict Non-durables
      d_EX_UT_HO_t = as.numeric(format(round(EX_UT_HO_t - EX_UT_HO_tm1,1),scientific=F)),
      d_EX_PC_MIS_t = as.numeric(format(round(EX_PC_MIS_t - EX_PC_MIS_tm1,1),scientific=F)),
      d_EX_TR_GAS_t = as.numeric(format(round(EX_TR_GAS_t - EX_TR_GAS_tm1,1),scientific=F)),
      d_EX_TBC_t = as.numeric(format(round(EX_TBC_t - EX_TBC_tm1,1),scientific=F)),  
      
      # Non-durables
      d_EX_APR_t = as.numeric(format(round(EX_APR_t - EX_APR_tm1,1),scientific=F)),
      d_EX_HLT_t = as.numeric(format(round(EX_HLT_t - EX_HLT_tm1,1),scientific=F)),
      d_EX_READ_t = as.numeric(format(round(EX_READ_t - EX_READ_tm1,1),scientific=F)), 
      
      # Total
      d_EX_HS_t = as.numeric(format(round(EX_HS_t - EX_HS_tm1,1),scientific=F)),
      d_EX_EDU_t = as.numeric(format(round(EX_EDU_t - EX_EDU_tm1,1),scientific=F)),
      d_EX_ENT_t = as.numeric(format(round(EX_ENT_t - EX_ENT_tm1,1),scientific=F)),
      d_EX_TRANS_t = as.numeric(format(round(EX_TRANS_t - EX_TRANS_tm1,1),scientific=F)), 
      d_EX_CACT_t = as.numeric(format(round(EX_CACT_t - EX_CACT_tm1,1),scientific=F))
    )
  
  return(df)
}

#### 3.3 Incorporating first difference into df ####
# To increase coding efficiency, we reduce df to df by quarter
df_19q4 <- df %>% filter(YYMM==1910|YYMM==1911|YYMM==1912)
df_20q1 <- df %>% filter(YYMM==2001|YYMM==2002|YYMM==2003)
df_20q2 <- df %>% filter(YYMM==2004|YYMM==2005|YYMM==2006)
df_20q3 <- df %>% filter(YYMM==2007|YYMM==2008|YYMM==2009)
df_20q4 <- df %>% filter(YYMM==2010|YYMM==2011|YYMM==2012)
df_21q1 <- df %>% filter(YYMM==2101)

# Each df by quarter are merged with the corresponding interviews in the last quarter 
df_19q4 <- diff_maker(df_19q4, fmli193)
df_20q1 <- diff_maker(df_20q1, fmli194)
df_20q2 <- diff_maker(df_20q2, fmli201)
df_20q3 <- diff_maker(df_20q3, fmli202)
df_20q4 <- diff_maker(df_20q4, fmli203)
df_21q1 <- diff_maker(df_21q1, fmli204)

# Binding df by quarters to form a complete df
df <- bind_rows(df_19q4, df_20q1, df_20q2, df_20q3, df_20q4, df_21q1)
# note that after this there is no rebates received later than two September rebates
# we will drop EIPI_oct, EIPI_nov, and EIPI_dec variables later

df <- df %>% 
  arrange(ID) %>% 
  group_by(ID) %>%
  arrange(YYMM, .by_group = TRUE) %>%
  ungroup()

### 4 Computing average weights, the first documented income, and liquidity ####
# wts_inc_liq_creator extract weights, income, and liquidity from fmli files
wts_inc_liq_creator <- function(fmli){
  fmli_weights_income <- fmli %>% select(NEWID, FINLWT21,FINCBTXM,LIQUDYR,LIQUDYRX) %>% 
    mutate(
      ID = substr(as.character(NEWID),1,6),
      # if CU reports no liquid accounts, then liquidity is set to 0
      LIQUDYRX = ifelse(LIQUDYR==2 & !is.na(LIQUDYR),0,LIQUDYRX)
    ) %>% select(ID, FINLWT21,FINCBTXM,LIQUDYRX)
  return(fmli_weights_income)}

# Apply wts_inc_liq_creator to each fmli file
fmli193_wts_inc_liq <- wts_inc_liq_creator(fmli193_copy)
fmli194_wts_inc_liq <- wts_inc_liq_creator(fmli194_copy)
fmli201_wts_inc_liq <- wts_inc_liq_creator(fmli201_copy)
fmli202_wts_inc_liq <- wts_inc_liq_creator(fmli202_copy)
fmli203_wts_inc_liq <- wts_inc_liq_creator(fmli203_copy)
fmli204_wts_inc_liq <- wts_inc_liq_creator(fmli204_copy)
fmli211_wts_inc_liq <- wts_inc_liq_creator(fmli211_copy) 
fmli212_wts_inc_liq <- wts_inc_liq_creator(fmli212_copy) # only used for liquidity

# Merge to obtain the weights, income, and liquidity in each interview 
wts_inc_liq <- merge(fmli193_wts_inc_liq,fmli194_wts_inc_liq,by="ID",all=TRUE)

wts_inc_liq <- wts_inc_liq %>% rename(
  FINLWT21_193 = FINLWT21.x, FINLWT21_194 = FINLWT21.y, FINCBTXM_193 = FINCBTXM.x,
  FINCBTXM_194 = FINCBTXM.y, LIQUDYRX_193 = LIQUDYRX.x, LIQUDYRX_194 = LIQUDYRX.y)

wts_inc_liq <- merge(wts_inc_liq,fmli201_wts_inc_liq, by="ID", all=TRUE)
wts_inc_liq <- wts_inc_liq %>% rename(
  FINLWT21_201 = FINLWT21, FINCBTXM_201 = FINCBTXM, LIQUDYRX_201 = LIQUDYRX)

wts_inc_liq <- merge(wts_inc_liq,fmli202_wts_inc_liq,by="ID",all=TRUE)
wts_inc_liq <- wts_inc_liq %>% rename(
  FINLWT21_202 = FINLWT21, FINCBTXM_202 = FINCBTXM, LIQUDYRX_202 = LIQUDYRX)

wts_inc_liq <- merge(wts_inc_liq,fmli203_wts_inc_liq,by="ID",all=TRUE)
wts_inc_liq <- wts_inc_liq %>% rename(
  FINLWT21_203 = FINLWT21, FINCBTXM_203 = FINCBTXM, LIQUDYRX_203 = LIQUDYRX)

wts_inc_liq <- merge(wts_inc_liq,fmli204_wts_inc_liq,by="ID",all=TRUE)
wts_inc_liq <- wts_inc_liq %>% rename(
  FINLWT21_204 = FINLWT21, FINCBTXM_204 = FINCBTXM, LIQUDYRX_204 = LIQUDYRX)

wts_inc_liq <- merge(wts_inc_liq,fmli211_wts_inc_liq,by="ID",all=TRUE)
wts_inc_liq <- wts_inc_liq %>% rename(
  FINLWT21_211 = FINLWT21, FINCBTXM_211 = FINCBTXM, LIQUDYRX_211 = LIQUDYRX)

wts_inc_liq <- merge(wts_inc_liq,fmli212_wts_inc_liq,by="ID",all=TRUE)
wts_inc_liq <- wts_inc_liq %>% rename(
  FINLWT21_212 = FINLWT21, FINCBTXM_212 = FINCBTXM, LIQUDYRX_212 = LIQUDYRX)

# Average weights 
# no FINLWT21_212 because fmli212 is only used for liquidity 
wts_inc_liq$FINLWT21_AVG <- rowMeans(wts_inc_liq[,c("FINLWT21_193",
                                                    "FINLWT21_194",
                                                    "FINLWT21_201",
                                                    "FINLWT21_202",
                                                    "FINLWT21_203",
                                                    "FINLWT21_204",
                                                    "FINLWT21_211")], 
                                     na.rm=TRUE)

# first income 
# no FINLWT21_212 because fmli212  is only used for liquidity 
wts_inc_liq$FINCBTXM_FST <- ifelse(!is.na(wts_inc_liq$FINCBTXM_193),wts_inc_liq$FINCBTXM_193,
                                   ifelse(!is.na(wts_inc_liq$FINCBTXM_194),wts_inc_liq$FINCBTXM_194,
                                          ifelse(!is.na(wts_inc_liq$FINCBTXM_201),wts_inc_liq$FINCBTXM_201,
                                                 ifelse(!is.na(wts_inc_liq$FINCBTXM_202),wts_inc_liq$FINCBTXM_202,
                                                        ifelse(!is.na(wts_inc_liq$FINCBTXM_203),wts_inc_liq$FINCBTXM_203,
                                                               ifelse(!is.na(wts_inc_liq$FINCBTXM_204),wts_inc_liq$FINCBTXM_204,wts_inc_liq$FINCBTXM_211))))))
# liquidity 
wts_inc_liq$LIQUDYRX <- ifelse(!is.na(wts_inc_liq$LIQUDYRX_193),wts_inc_liq$LIQUDYRX_193,
                               ifelse(!is.na(wts_inc_liq$LIQUDYRX_194),wts_inc_liq$LIQUDYRX_194,
                                      ifelse(!is.na(wts_inc_liq$LIQUDYRX_201),wts_inc_liq$LIQUDYRX_201,
                                             ifelse(!is.na(wts_inc_liq$LIQUDYRX_202),wts_inc_liq$LIQUDYRX_202,
                                                    ifelse(!is.na(wts_inc_liq$LIQUDYRX_203),wts_inc_liq$LIQUDYRX_203,
                                                           ifelse(!is.na(wts_inc_liq$LIQUDYRX_204),wts_inc_liq$LIQUDYRX_204,
                                                                  ifelse(!is.na(wts_inc_liq$LIQUDYRX_211),wts_inc_liq$LIQUDYRX_211,wts_inc_liq$LIQUDYRX_212)))))))

wts_inc_liq <- wts_inc_liq %>% select(ID,FINLWT21_AVG,FINCBTXM_FST,LIQUDYRX)

### 5 Computing average expenditure (for scaling) ####

#### 5.1 fmli_expd_creator select expd variables ####

fmli_expd_creator <- function(fmli){
  # Only the expd variables
  fmli <- fmli %>% select(1,11:29)
  return(fmli)
}

# Apply to each fmli
fmli193_expd <- fmli_expd_creator(fmli193)
fmli194_expd <- fmli_expd_creator(fmli194)
fmli201_expd <- fmli_expd_creator(fmli201)
fmli202_expd <- fmli_expd_creator(fmli202)
fmli203_expd <- fmli_expd_creator(fmli203)
fmli204_expd <- fmli_expd_creator(fmli204)
fmli211_expd <- fmli_expd_creator(fmli211) 

#### 5.2 Merge expd in all periods ####
expd <- merge(fmli193_expd,fmli194_expd,by="ID",all=TRUE)

expd <- expd %>% rename(
  EX_FD_193 = EX_FD.x, EX_SN_193 = EX_SN.x, EX_N_193 = EX_N.x, EX_T_193 = EX_T.x,
  EX_FD_HM_193 = EX_FD_HM.x, EX_FD_AW_193 = EX_FD_AW.x, EX_ALC_193 = EX_ALC.x,
  EX_UT_HO_193 = EX_UT_HO.x, EX_PC_MIS_193 = EX_PC_MIS.x, EX_TR_GAS_193 = EX_TR_GAS.x,
  EX_TBC_193 = EX_TBC.x, EX_APR_193 = EX_APR.x, EX_HLT_193 = EX_HLT.x, EX_READ_193 = EX_READ.x,
  EX_HS_193 = EX_HS.x, EX_EDU_193 = EX_EDU.x, EX_ENT_193 = EX_ENT.x, EX_TRANS_193 = EX_TRANS.x,
  EX_CACT_193 = EX_CACT.x,
  
  EX_FD_194 = EX_FD.y, EX_SN_194 = EX_SN.y, EX_N_194 = EX_N.y, EX_T_194 = EX_T.y,
  EX_FD_HM_194 = EX_FD_HM.y, EX_FD_AW_194 = EX_FD_AW.y, EX_ALC_194 = EX_ALC.y,
  EX_UT_HO_194 = EX_UT_HO.y, EX_PC_MIS_194 = EX_PC_MIS.y, EX_TR_GAS_194 = EX_TR_GAS.y,
  EX_TBC_194 = EX_TBC.y, EX_APR_194 = EX_APR.y, EX_HLT_194 = EX_HLT.y, EX_READ_194 = EX_READ.y,
  EX_HS_194 = EX_HS.y, EX_EDU_194 = EX_EDU.y, EX_ENT_194 = EX_ENT.y, EX_TRANS_194 = EX_TRANS.y,
  EX_CACT_194 = EX_CACT.y)

# merge expd with fmli201
expd <- merge(expd,fmli201_expd,by="ID",all=TRUE)

expd <- expd %>% rename(
  EX_FD_201 = EX_FD, EX_SN_201 = EX_SN, EX_N_201 = EX_N, EX_T_201 = EX_T,
  EX_FD_HM_201 = EX_FD_HM, EX_FD_AW_201 = EX_FD_AW, EX_ALC_201 = EX_ALC,
  EX_UT_HO_201 = EX_UT_HO, EX_PC_MIS_201 = EX_PC_MIS, EX_TR_GAS_201 = EX_TR_GAS,
  EX_TBC_201 = EX_TBC, EX_APR_201 = EX_APR, EX_HLT_201 = EX_HLT, EX_READ_201 = EX_READ,
  EX_HS_201 = EX_HS, EX_EDU_201 = EX_EDU, EX_ENT_201 = EX_ENT, EX_TRANS_201 = EX_TRANS,
  EX_CACT_201 = EX_CACT)

# merge expd with fmli202
expd <- merge(expd,fmli202_expd,by="ID",all=TRUE)

expd <- expd %>% rename(
  EX_FD_202 = EX_FD, EX_SN_202 = EX_SN, EX_N_202 = EX_N, EX_T_202 = EX_T,
  EX_FD_HM_202 = EX_FD_HM, EX_FD_AW_202 = EX_FD_AW, EX_ALC_202 = EX_ALC,
  EX_UT_HO_202 = EX_UT_HO, EX_PC_MIS_202 = EX_PC_MIS, EX_TR_GAS_202 = EX_TR_GAS,
  EX_TBC_202 = EX_TBC, EX_APR_202 = EX_APR, EX_HLT_202 = EX_HLT, EX_READ_202 = EX_READ,
  EX_HS_202 = EX_HS, EX_EDU_202 = EX_EDU, EX_ENT_202 = EX_ENT, EX_TRANS_202 = EX_TRANS,
  EX_CACT_202 = EX_CACT)

# merge expd with fmli203
expd <- merge(expd,fmli203_expd,by="ID",all=TRUE)

expd <- expd %>% rename(
  EX_FD_203 = EX_FD, EX_SN_203 = EX_SN, EX_N_203 = EX_N, EX_T_203 = EX_T,
  EX_FD_HM_203 = EX_FD_HM, EX_FD_AW_203 = EX_FD_AW, EX_ALC_203 = EX_ALC,
  EX_UT_HO_203 = EX_UT_HO, EX_PC_MIS_203 = EX_PC_MIS, EX_TR_GAS_203 = EX_TR_GAS,
  EX_TBC_203 = EX_TBC, EX_APR_203 = EX_APR, EX_HLT_203 = EX_HLT, EX_READ_203 = EX_READ,
  EX_HS_203 = EX_HS, EX_EDU_203 = EX_EDU, EX_ENT_203 = EX_ENT, EX_TRANS_203 = EX_TRANS,
  EX_CACT_203 = EX_CACT)

# merge expd with fmli204
expd <- merge(expd,fmli204_expd,by="ID",all=TRUE)

expd <- expd %>% rename(
  EX_FD_204 = EX_FD, EX_SN_204 = EX_SN, EX_N_204 = EX_N, EX_T_204 = EX_T,
  EX_FD_HM_204 = EX_FD_HM, EX_FD_AW_204 = EX_FD_AW, EX_ALC_204 = EX_ALC,
  EX_UT_HO_204 = EX_UT_HO, EX_PC_MIS_204 = EX_PC_MIS, EX_TR_GAS_204 = EX_TR_GAS,
  EX_TBC_204 = EX_TBC, EX_APR_204 = EX_APR, EX_HLT_204 = EX_HLT, EX_READ_204 = EX_READ,
  EX_HS_204 = EX_HS, EX_EDU_204 = EX_EDU, EX_ENT_204 = EX_ENT, EX_TRANS_204 = EX_TRANS,
  EX_CACT_204 = EX_CACT)

# merge expd with fmli211
expd <- merge(expd,fmli211_expd,by="ID",all=TRUE)

expd <- expd %>% rename(
  EX_FD_211 = EX_FD, EX_SN_211 = EX_SN, EX_N_211 = EX_N, EX_T_211 = EX_T,
  EX_FD_HM_211 = EX_FD_HM, EX_FD_AW_211 = EX_FD_AW, EX_ALC_211 = EX_ALC,
  EX_UT_HO_211 = EX_UT_HO, EX_PC_MIS_211 = EX_PC_MIS, EX_TR_GAS_211 = EX_TR_GAS,
  EX_TBC_211 = EX_TBC, EX_APR_211 = EX_APR, EX_HLT_211 = EX_HLT, EX_READ_211 = EX_READ,
  EX_HS_211 = EX_HS, EX_EDU_211 = EX_EDU, EX_ENT_211 = EX_ENT, EX_TRANS_211 = EX_TRANS,
  EX_CACT_211 = EX_CACT)

#### 5.3 Computing average expenditures for scaling #####

expd <- expd %>% mutate(
  EX_FD_AVG = rowMeans(expd[,c("EX_FD_193","EX_FD_194","EX_FD_201",
                               "EX_FD_202", "EX_FD_203", "EX_FD_204", "EX_FD_211")], na.rm=TRUE),
  
  EX_SN_AVG = rowMeans(expd[,c("EX_SN_193","EX_SN_194","EX_SN_201",
                               "EX_SN_202", "EX_SN_203", "EX_SN_204", "EX_SN_211")], na.rm=TRUE),
  
  EX_N_AVG = rowMeans(expd[,c("EX_N_193","EX_N_194","EX_N_201",
                              "EX_N_202", "EX_N_203", "EX_N_204", "EX_N_211")], na.rm=TRUE),
  
  EX_T_AVG = rowMeans(expd[,c("EX_T_193","EX_T_194","EX_T_201",
                              "EX_T_202", "EX_T_203","EX_T_204","EX_T_211")], na.rm=TRUE),
  
  EX_FD_HM_AVG = rowMeans(expd[,c("EX_FD_HM_193","EX_FD_HM_194","EX_FD_HM_201",
                                  "EX_FD_HM_202", "EX_FD_HM_203","EX_FD_HM_204","EX_FD_HM_211")], na.rm=TRUE),
  
  EX_FD_AW_AVG = rowMeans(expd[,c("EX_FD_AW_193","EX_FD_AW_194","EX_FD_AW_201",
                                  "EX_FD_AW_202", "EX_FD_AW_203", "EX_FD_AW_204", "EX_FD_AW_211")], na.rm=TRUE),
  
  EX_ALC_AVG = rowMeans(expd[,c("EX_ALC_193","EX_ALC_194","EX_ALC_201",
                                "EX_ALC_202", "EX_ALC_203","EX_ALC_204","EX_ALC_211")], na.rm=TRUE),
  
  EX_UT_HO_AVG = rowMeans(expd[,c("EX_UT_HO_193","EX_UT_HO_194","EX_UT_HO_201",
                                  "EX_UT_HO_202", "EX_UT_HO_203","EX_UT_HO_204","EX_UT_HO_211")], na.rm=TRUE),
  
  EX_PC_MIS_AVG = rowMeans(expd[,c("EX_PC_MIS_193","EX_PC_MIS_194","EX_PC_MIS_201",
                                   "EX_PC_MIS_202", "EX_PC_MIS_203", "EX_PC_MIS_204", "EX_PC_MIS_211")], na.rm=TRUE),
  
  EX_TR_GAS_AVG = rowMeans(expd[,c("EX_TR_GAS_193","EX_TR_GAS_194","EX_TR_GAS_201",
                                   "EX_TR_GAS_202", "EX_TR_GAS_203", "EX_TR_GAS_204", "EX_TR_GAS_211")], na.rm=TRUE),
  
  EX_TBC_AVG = rowMeans(expd[,c("EX_TBC_193","EX_TBC_194","EX_TBC_201",
                                "EX_TBC_202", "EX_TBC_203", "EX_TBC_204", "EX_TBC_211")], na.rm=TRUE),
  
  EX_APR_AVG = rowMeans(expd[,c("EX_APR_193","EX_APR_194","EX_APR_201",
                                "EX_APR_202", "EX_APR_203","EX_APR_204","EX_APR_211")], na.rm=TRUE),
  
  EX_HLT_AVG = rowMeans(expd[,c("EX_HLT_193","EX_HLT_194","EX_HLT_201",
                                "EX_HLT_202", "EX_HLT_203","EX_HLT_204","EX_HLT_211")], na.rm=TRUE),
  
  EX_READ_AVG = rowMeans(expd[,c("EX_READ_193","EX_READ_194","EX_READ_201",
                                 "EX_READ_202", "EX_READ_203","EX_READ_204","EX_READ_211")], na.rm=TRUE),
  
  EX_HS_AVG = rowMeans(expd[,c("EX_HS_193","EX_HS_194","EX_HS_201",
                               "EX_HS_202", "EX_HS_203","EX_HS_204","EX_HS_211")], na.rm=TRUE),
  
  EX_EDU_AVG = rowMeans(expd[,c("EX_EDU_193","EX_EDU_194","EX_EDU_201",
                                "EX_EDU_202", "EX_EDU_203","EX_EDU_204","EX_EDU_211")], na.rm=TRUE),
  
  EX_ENT_AVG = rowMeans(expd[,c("EX_ENT_193","EX_ENT_194","EX_ENT_201",
                                "EX_ENT_202", "EX_ENT_203","EX_ENT_204","EX_ENT_211" )], na.rm=TRUE),
  
  EX_TRANS_AVG = rowMeans(expd[,c("EX_TRANS_193","EX_TRANS_194","EX_TRANS_201",
                                  "EX_TRANS_202", "EX_TRANS_203","EX_TRANS_204","EX_TRANS_211")], na.rm=TRUE),
  
  EX_CACT_AVG = rowMeans(expd[,c("EX_CACT_193","EX_CACT_194","EX_CACT_201",
                                 "EX_CACT_202", "EX_CACT_203","EX_CACT_204","EX_CACT_211")], na.rm=TRUE)) %>% 
  
  select(ID,EX_FD_AVG,EX_SN_AVG,EX_N_AVG,EX_T_AVG,EX_FD_HM_AVG,EX_FD_AW_AVG,EX_ALC_AVG,EX_UT_HO_AVG,
         EX_PC_MIS_AVG,EX_TR_GAS_AVG,EX_TBC_AVG,EX_APR_AVG,EX_HLT_AVG,EX_READ_AVG,
         EX_HS_AVG,EX_EDU_AVG,EX_ENT_AVG,EX_TRANS_AVG,EX_CACT_AVG) # Only ID and averages

### 6 Merge 4 and 5 results into df and re-arrange variables ####

#### 6.1 Merge weights, income, liquidity, and average expenditure into df ####
# merge df with weights, income, and liquidity
df <- merge(df, wts_inc_liq, by="ID")
# merge df with avergae expenditure 
df <- merge(df, expd, by="ID")

#### 6.2 Final re-arrange ####
df <- df %>% select(
  ID, NEWID, YYMM, INTERI, FINLWT21_AVG,FINCBTXM_FST,LIQUDYRX,
  
  # basic EIPIs
  EIPI_t, EIPI_tm1, EIPI_tm2, iEIPI_t, iEIPI_tm1, iEIPI_tm2, EIPI_t_count,
  
  # Basic expenditure 
  d_EX_FD_t, d_EX_SN_t, d_EX_N_t, d_EX_T_t,
  
  # Controls   
  d_NUM_ADTS_t, d_NUM_KIDS_t, AGE_AVG_t,   
  
  # Demographics for cleaning
  d_AGE_REF_t, d_AGE_2, d_FAM_SIZE_t, NUM_KIDS_t, NUM_ADTS_t, FAM_SIZE_t, 
  AGE_REF_t, AGE2_t, SEX_REF_t, SEX_REF_tm1, MARITAL1_t, MARITAL1_tm1, CUTENURE_t,
  NUM_KIDS_tm1, NUM_ADTS_tm1, 
  
  # More EIPIs
  EIPI_by_ck_t, EIPI_by_dd_t, EIPI_by_dc_t,
  EIPI_for_ep_t, EIPI_for_debt_t, EIPI_for_sv_t, iEIPI_by_ck_t, iEIPI_by_dd_t , iEIPI_by_dc_t,
  iEIPI_for_ep_t, iEIPI_for_debt_t, iEIPI_for_sv_t, 
  EIPI_apr_t,EIPI_may_t,EIPI_jun_t,EIPI_jul_t,EIPI_aug_t,EIPI_sep_t,
  
  EIPI_by_ck_tm1, EIPI_by_dd_tm1, EIPI_by_dc_tm1,
  EIPI_for_ep_tm1, EIPI_for_debt_tm1, EIPI_for_sv_tm1, iEIPI_by_ck_tm1, iEIPI_by_dd_tm1, iEIPI_by_dc_tm1,
  iEIPI_for_ep_tm1, iEIPI_for_debt_tm1, iEIPI_for_sv_tm1, 
  EIPI_apr_tm1,EIPI_may_tm1,EIPI_jun_tm1,EIPI_jul_tm1,EIPI_aug_tm1,EIPI_sep_tm1,
  
  EIPI_by_ck_tm2, EIPI_by_dd_tm2, EIPI_by_dc_tm2,
  EIPI_for_ep_tm2, EIPI_for_debt_tm2, EIPI_for_sv_tm2, iEIPI_by_ck_tm2, iEIPI_by_dd_tm2, iEIPI_by_dc_tm2,
  iEIPI_for_ep_tm2, iEIPI_for_debt_tm2, iEIPI_for_sv_tm2, 
  EIPI_apr_tm2,EIPI_may_tm2,EIPI_jun_tm2,EIPI_jul_tm2,EIPI_aug_tm2,EIPI_sep_tm2,
  
  # EIPI Status dummies
  r, ck, dd, dc, ck_dd, ck_dc, dd_dc, ck_dd_dc,
  ep, debt, sv, ep_debt, ep_sv, debt_sv, ep_debt_sv, 
  
  # Expenditure levels 
  EX_FD_t, EX_FD_tm1, EX_SN_t, EX_SN_tm1, EX_N_t, EX_N_tm1, EX_T_t, EX_T_tm1,
  
  # Expenditure subcategories 
  d_EX_FD_HM_t, d_EX_FD_AW_t, d_EX_ALC_t, d_EX_UT_HO_t, d_EX_PC_MIS_t, d_EX_TR_GAS_t,
  d_EX_TBC_t, d_EX_APR_t, d_EX_HLT_t, d_EX_READ_t, d_EX_HS_t, d_EX_EDU_t, d_EX_ENT_t,
  d_EX_TRANS_t, d_EX_CACT_t,
  
  # Scalers
  EX_FD_AVG, EX_SN_AVG, EX_N_AVG, EX_T_AVG,
  
  EX_FD_HM_AVG, EX_FD_AW_AVG,
  EX_ALC_AVG, EX_UT_HO_AVG, EX_PC_MIS_AVG, EX_TR_GAS_AVG, EX_TBC_AVG, EX_APR_AVG,
  EX_HLT_AVG, EX_READ_AVG, EX_HS_AVG, EX_EDU_AVG, EX_ENT_AVG, EX_TRANS_AVG,
  EX_CACT_AVG)

### 7 Clean df to get the two samples ####

#### 7.1 All households sample ####
# We reduce the sample step by step s
# so we can trace out the change in sample size after each step

# Drop if lives in student housing
df_all_cu <- df %>% filter(CUTENURE_t != 6)

# Drop if age_ref < 21 or age2 < 21 (or > 85)
df_all_cu <- df_all_cu %>% filter(AGE_REF_t >= 21 & AGE_REF_t <= 85)
df_all_cu <- df_all_cu %>% filter(is.na(AGE2_t) | (AGE2_t >= 21 & AGE2_t <= 85))

# Drop if age_ref change is greater than 1 or less than 0 (if the sex of the reference person is the same)
df_all_cu$drop <- ifelse(((df_all_cu$d_AGE_REF_t > 1 | df_all_cu$d_AGE_REF_t < 0) & df_all_cu$SEX_REF_t == df_all_cu$SEX_REF_tm1), 1, 0)
df_all_cu <- df_all_cu %>% filter(df_all_cu$drop==0)

#  Drop if age2 change is greater than 1 or less than 0 (if the reference person has the same sex or marital status)
df_all_cu$drop <- ifelse(((df_all_cu$d_AGE_2 > 1 | df_all_cu$d_AGE_2 < 0) & df_all_cu$SEX_REF_t == df_all_cu$SEX_REF_tm1
                          & df_all_cu$MARITAL1_t == df_all_cu$MARITAL1_tm1), 1, 0)
df_all_cu <- df_all_cu %>% filter(df_all_cu$drop==0|is.na(df_all_cu$drop))

# Drop if change in family size is greater than or less than 3 in absolute values
df_all_cu <- df_all_cu %>% filter (d_FAM_SIZE_t <= 3 & d_FAM_SIZE_t >= -3)

# Drop bottom 1 percent of CUs  in terms of nondurable consumption after adjustment for CU size and time trend
df_all_cu$EX_N_PC <- df_all_cu$EX_N_t / (df_all_cu$NUM_ADTS_t + 0.6 *df_all_cu$NUM_KIDS_t)

df_all_cu <- df_all_cu %>% mutate(
  TT = ifelse(YYMM==1912,0,
              ifelse(YYMM==2001,1,
                     ifelse(YYMM==2003,3,
                            ifelse(YYMM==2004,4,
                                   ifelse(YYMM==2006,6,
                                          ifelse(YYMM==2007,7,
                                                 ifelse(YYMM==2009,9,
                                                        ifelse(YYMM==2010,10,12)))))))))

# Quantile regression of per capita consumption on time trend for the bottom 1%
qr_bot <- rq(data=df_all_cu,EX_N_PC~TT,tau=0.01)

summary(qr_bot)

df_all_cu$fit_val_bot <- qr_bot[["fitted.values"]]

df_all_cu$drop <- ifelse(df_all_cu$fit_val_bot > df_all_cu$EX_N_PC, 1, 0)

df_all_cu <- df_all_cu %>% filter(df_all_cu$drop==0) %>% select(-c(EX_N_PC,fit_val_bot,drop,TT))
write.csv(df_all_cu,"df_all_cu.csv",row.names = FALSE)

#### 7.2 Final Panel ####

# drop if lives in student housing
df_f <- df %>% filter(CUTENURE_t != 6)

# drop if age_ref < 21 or age2 < 21
df_f <- df_f %>% filter(AGE_REF_t >= 21)
df_f <- df_f %>% filter(is.na(AGE2_t) | AGE2_t >= 21)

# drop if age_ref change is greater than 1 or less than 0 (if the sex of the reference person is the same)
df_f$drop <- ifelse(((df_f$d_AGE_REF_t > 1 | df_f$d_AGE_REF_t < 0) & df_f$SEX_REF_t == df_f$SEX_REF_tm1), 1, 0)
df_f <- df_f %>% filter(df_f$drop==0)

#  drop if age2 change is greater than 1 or less than 0 (if the reference person has the same sex or marital status)
df_f$drop <- ifelse(((df_f$d_AGE_2 > 1 | df_f$d_AGE_2 < 0) & df_f$SEX_REF_t == df_f$SEX_REF_tm1
                     & df_f$MARITAL1_t == df_f$MARITAL1_tm1), 1, 0)
df_f <- df_f %>% filter(df_f$drop==0|is.na(df_f$drop))


# drop if change in family size is greater than or less than 3 in absolute values
df_f <- df_f %>% filter (d_FAM_SIZE_t <= 3 & d_FAM_SIZE_t >= -3)

# drop bottom 1 percent of CUs in terms of non-durable consumption in each month after adjustment for CU size
df_f$EX_N_PC <- df_f$EX_N_t / (df_f$NUM_ADTS_t + 0.6 *df_f$NUM_KIDS_t)

df_f %>%
  group_by(YYMM) %>%
  summarize(quant1 = quantile(EX_N_PC, probs = 0.01)) %>%
  ungroup()

df_f$drop <- ifelse((df_f$YYMM == 1912 & df_f$EX_N_PC <= 657), 1, 
                    ifelse((df_f$YYMM == 2001 & df_f$EX_N_PC <= 375), 1,
                           ifelse((df_f$YYMM == 2003 & df_f$EX_N_PC <= 626), 1, 
                                  ifelse((df_f$YYMM == 2004 & df_f$EX_N_PC <= 576), 1,
                                         ifelse((df_f$YYMM == 2006 & df_f$EX_N_PC <= 633), 1,
                                                ifelse((df_f$YYMM == 2007 & df_f$EX_N_PC <= 642), 1,
                                                       ifelse((df_f$YYMM == 2009 & df_f$EX_N_PC <= 686), 1,
                                                              ifelse((df_f$YYMM == 2010 & df_f$EX_N_PC <= 636), 1,
                                                                     ifelse((df_f$YYMM == 2012 & df_f$EX_N_PC <= 634), 1,
                                                                            ifelse((df_f$YYMM == 2101 & df_f$EX_N_PC <= 779), 1, 0
                                                                            ))))))))))
df_f <- df_f %>% filter(df_f$drop==0)

# Drop high income
df_f$MARITAL_t <- ifelse(df_f$MARITAL1_t == 1, 1, 0)

#### Income cutoff table ####

# For single, without kids

# check <- df_f %>% filter(MARITAL_t==0 & NUM_ADTS_t ==1 & NUM_KIDS_t ==0 & FINCBTXM_FST < 100000 & FINCBTXM_FST > 75000)
# table(check$r)
# 
# check <- df_f %>% filter(MARITAL_t==0 & NUM_ADTS_t ==1 & NUM_KIDS_t ==0 & FINCBTXM_FST < 125000 & FINCBTXM_FST > 100000)
# table(check$r)
# 
# check <- df_f %>% filter(MARITAL_t==0 & NUM_ADTS_t ==1 & NUM_KIDS_t ==0 & FINCBTXM_FST < 150000 & FINCBTXM_FST > 125000)
# table(check$r)
# 
# check <- df_f %>% filter(MARITAL_t==0 & NUM_ADTS_t ==1 & NUM_KIDS_t ==0 & FINCBTXM_FST < 175000 & FINCBTXM_FST > 150000)
# table(check$r)
# 
# check <- df_f %>% filter(MARITAL_t==0 & NUM_ADTS_t ==1 & NUM_KIDS_t ==0 & FINCBTXM_FST < 200000 & FINCBTXM_FST > 175000)
# table(check$r)

# For single, with kids

# check <- df_f %>% filter(MARITAL_t==0 & NUM_ADTS_t ==1 & NUM_KIDS_t >0 & FINCBTXM_FST < 150000 & FINCBTXM_FST > 125000)
# table(check$r)
# #
# check <- df_f %>% filter(MARITAL_t==0 & NUM_ADTS_t ==1 & NUM_KIDS_t >0 & FINCBTXM_FST < 175000 & FINCBTXM_FST > 150000)
# table(check$r)
# #
# check <- df_f %>% filter(MARITAL_t==0 & NUM_ADTS_t ==1 & NUM_KIDS_t >0 & FINCBTXM_FST < 200000 & FINCBTXM_FST > 175000)
# table(check$r)
# 
# check <- df_f %>% filter(MARITAL_t==0 & NUM_ADTS_t ==1 & NUM_KIDS_t >0 & FINCBTXM_FST < 225000 & FINCBTXM_FST > 200000)
# table(check$r)
# 
# check <- df_f %>% filter(MARITAL_t==0 & NUM_ADTS_t ==1 & NUM_KIDS_t >0 & FINCBTXM_FST < 250000 & FINCBTXM_FST > 225000)
# table(check$r)

# For married, no kids
# check <- df_f %>% filter(MARITAL_t==1 & NUM_KIDS_t ==0 & FINCBTXM_FST < 200000 & FINCBTXM_FST > 175000)
# table(check$r)
# 
# check <- df_f %>% filter(MARITAL_t==1 & NUM_KIDS_t ==0 & FINCBTXM_FST < 225000 & FINCBTXM_FST > 200000)
# table(check$r)
# 
# check <- df_f %>% filter(MARITAL_t==1 & NUM_KIDS_t ==0 & FINCBTXM_FST < 250000 & FINCBTXM_FST > 225000)
# table(check$r)
# 
# check <- df_f %>% filter(MARITAL_t==1 & NUM_KIDS_t ==0 & FINCBTXM_FST < 275000 & FINCBTXM_FST > 250000)
# table(check$r)
# 
# check <- df_f %>% filter(MARITAL_t==1 & NUM_KIDS_t ==0 & FINCBTXM_FST < 300000 & FINCBTXM_FST > 275000)
# table(check$r)
# 
# check <- df_f %>% filter(MARITAL_t==1 & NUM_KIDS_t ==0 & FINCBTXM_FST < 325000 & FINCBTXM_FST > 300000)
# table(check$r)
# 
# check <- df_f %>% filter(MARITAL_t==1 & NUM_KIDS_t ==0 & FINCBTXM_FST < 350000 & FINCBTXM_FST > 325000)
# table(check$r)
# 
# check <- df_f %>% filter(MARITAL_t==1 & NUM_KIDS_t ==0 & FINCBTXM_FST < 375000 & FINCBTXM_FST > 350000)
# table(check$r)
# 
# check <- df_f %>% filter(MARITAL_t==1 & NUM_KIDS_t ==0 & FINCBTXM_FST < 400000 & FINCBTXM_FST > 375000)
# table(check$r)
# 
# check <- df_f %>% filter(MARITAL_t==1 & NUM_KIDS_t ==0 & FINCBTXM_FST < 425000 & FINCBTXM_FST > 400000)
# table(check$r)

# For married, with kids
# check <- df_f %>% filter(MARITAL_t==1 & NUM_KIDS_t >0 & FINCBTXM_FST < 225000 & FINCBTXM_FST > 200000)
# table(check$r)
# 
# check <- df_f %>% filter(MARITAL_t==1 & NUM_KIDS_t >0 & FINCBTXM_FST < 250000 & FINCBTXM_FST > 225000)
# table(check$r)
# 
# check <- df_f %>% filter(MARITAL_t==1 & NUM_KIDS_t >0 & FINCBTXM_FST < 275000 & FINCBTXM_FST > 250000)
# table(check$r)
# 
# check <- df_f %>% filter(MARITAL_t==1 & NUM_KIDS_t >0 & FINCBTXM_FST < 300000 & FINCBTXM_FST > 275000)
# table(check$r)
# 
# check <- df_f %>% filter(MARITAL_t==1 & NUM_KIDS_t >0 & FINCBTXM_FST < 325000 & FINCBTXM_FST > 300000)
# table(check$r)
# 
# check <- df_f %>% filter(MARITAL_t==1 & NUM_KIDS_t >0 & FINCBTXM_FST < 350000 & FINCBTXM_FST > 325000)
# table(check$r)
# 
# check <- df_f %>% filter(MARITAL_t==1 & NUM_KIDS_t >0 & FINCBTXM_FST < 375000 & FINCBTXM_FST > 350000)
# table(check$r)
# 
# check <- df_f %>% filter(MARITAL_t==1 & NUM_KIDS_t >0 & FINCBTXM_FST < 400000 & FINCBTXM_FST > 375000)
# table(check$r)
# 
# check <- df_f %>% filter(MARITAL_t==1 & NUM_KIDS_t >0 & FINCBTXM_FST < 425000 & FINCBTXM_FST > 400000)
# table(check$r)

# For adults, no kids

# check <- df_f %>% filter(MARITAL_t==0 & NUM_ADTS_t > 1 & NUM_KIDS_t ==0 & FINCBTXM_FST < 225000 & FINCBTXM_FST > 200000)
# table(check$r)
# 
# check <- df_f %>% filter(MARITAL_t==0 & NUM_ADTS_t > 1 & NUM_KIDS_t ==0 & FINCBTXM_FST < 250000 & FINCBTXM_FST > 225000)
# table(check$r)
# 
# check <- df_f %>% filter(MARITAL_t==0 & NUM_ADTS_t > 1 & NUM_KIDS_t ==0 & FINCBTXM_FST < 275000 & FINCBTXM_FST > 250000)
# table(check$r)
# 
# check <- df_f %>% filter(MARITAL_t==0 & NUM_ADTS_t > 1 & NUM_KIDS_t ==0 & FINCBTXM_FST < 300000 & FINCBTXM_FST > 275000)
# table(check$r)
# 
# check <- df_f %>% filter(MARITAL_t==0 & NUM_ADTS_t > 1 & NUM_KIDS_t ==0 & FINCBTXM_FST < 325000 & FINCBTXM_FST > 300000)
# table(check$r)
# 
# check <- df_f %>% filter(MARITAL_t==0 & NUM_ADTS_t > 1 & NUM_KIDS_t ==0 & FINCBTXM_FST < 350000 & FINCBTXM_FST > 325000)
# table(check$r)
# 
# check <- df_f %>% filter(MARITAL_t==0 & NUM_ADTS_t > 1 & NUM_KIDS_t ==0 & FINCBTXM_FST < 375000 & FINCBTXM_FST > 350000)
# table(check$r)
# 
# check <- df_f %>% filter(MARITAL_t==0 & NUM_ADTS_t > 1 & NUM_KIDS_t ==0 & FINCBTXM_FST < 400000 & FINCBTXM_FST > 375000)
# table(check$r)
# 
# check <- df_f %>% filter(MARITAL_t==0 & NUM_ADTS_t > 1 & NUM_KIDS_t ==0 & FINCBTXM_FST < 425000 & FINCBTXM_FST > 400000)
# table(check$r)
# 
# check <- df_f %>% filter(MARITAL_t==0 & NUM_ADTS_t > 1 & NUM_KIDS_t ==0 & FINCBTXM_FST < 450000 & FINCBTXM_FST > 425000)
# table(check$r)

# For adults, with kids

# check <- df_f %>% filter(MARITAL_t==0 & NUM_ADTS_t > 1 & NUM_KIDS_t > 0 & FINCBTXM_FST < 250000 & FINCBTXM_FST > 225000)
# table(check$r)
# 
# check <- df_f %>% filter(MARITAL_t==0 & NUM_ADTS_t > 1 & NUM_KIDS_t > 0 & FINCBTXM_FST < 275000 & FINCBTXM_FST > 250000)
# table(check$r)
# 
# check <- df_f %>% filter(MARITAL_t==0 & NUM_ADTS_t > 1 & NUM_KIDS_t > 0 & FINCBTXM_FST < 300000 & FINCBTXM_FST > 275000)
# table(check$r)
# 
# check <- df_f %>% filter(MARITAL_t==0 & NUM_ADTS_t > 1 & NUM_KIDS_t > 0 & FINCBTXM_FST < 325000 & FINCBTXM_FST > 300000)
# table(check$r)
# 
# check <- df_f %>% filter(MARITAL_t==0 & NUM_ADTS_t > 1 & NUM_KIDS_t > 0 & FINCBTXM_FST < 350000 & FINCBTXM_FST > 325000)
# table(check$r)
# 
# check <- df_f %>% filter(MARITAL_t==0 & NUM_ADTS_t > 1 & NUM_KIDS_t > 0 & FINCBTXM_FST < 375000 & FINCBTXM_FST > 350000)
# table(check$r)
# 
# check <- df_f %>% filter(MARITAL_t==0 & NUM_ADTS_t > 1 & NUM_KIDS_t > 0 & FINCBTXM_FST < 400000 & FINCBTXM_FST > 375000)
# table(check$r)
# 
# check <- df_f %>% filter(MARITAL_t==0 & NUM_ADTS_t > 1 & NUM_KIDS_t > 0 & FINCBTXM_FST < 425000 & FINCBTXM_FST > 400000)
# table(check$r)
# 
# check <- df_f %>% filter(MARITAL_t==0 & NUM_ADTS_t > 1 & NUM_KIDS_t > 0 & FINCBTXM_FST < 450000 & FINCBTXM_FST > 425000)
# table(check$r)


df_f$drop <- ifelse((df_f$MARITAL_t==0 & df_f$NUM_ADTS_t==1 & df_f$NUM_KIDS_t ==0 & df_f$FINCBTXM_FST <= 175000), 0,
                    ifelse((df_f$MARITAL_t==0 & df_f$NUM_ADTS_t==1 & df_f$NUM_KIDS_t >0 & df_f$FINCBTXM_FST <= 225000), 0,
                           ifelse((df_f$MARITAL_t==1 & df_f$NUM_KIDS_t ==0 & df_f$FINCBTXM_FST <= 400000), 0,
                                  ifelse((df_f$MARITAL_t==1 & df_f$NUM_KIDS_t >0 & df_f$FINCBTXM_FST <= 400000), 0,
                                         ifelse((df_f$MARITAL_t==0 & df_f$NUM_ADTS_t > 1 & df_f$NUM_KIDS_t ==0 & df_f$FINCBTXM_FST <= 425000), 0,
                                                ifelse((df_f$MARITAL_t==0 & df_f$NUM_ADTS_t > 1 & df_f$NUM_KIDS_t >0 & df_f$FINCBTXM_FST <= 425000), 0, 1
                                                ))))))

df_f <- df_f %>% filter(df_f$drop==0) %>% select(-c(EX_N_PC, drop, MARITAL_t))
write.csv(df_f,"df_f.csv", row.names = FALSE)

### 8 Create imputed value of EIP1 ###

# Imprt imputed EIP data
imput_eip <- read_excel("Raw data/imputed_eip.xlsx")

#### 8.1 Restrict df_f to only observations on the 2006 or 2007 interview cycle
df_f_imp <- df_f %>% group_by(ID) %>% filter(any(YYMM == 2006 | YYMM == 2007 | YYMM == 1912 | YYMM == 2001 | YYMM == 2003 | 
                                                 YYMM == 2004 | YYMM == 2009 | YYMM == 2010 | YYMM == 2012 | YYMM == 2101)) 

#### 8.2 Mege df_f_imp with imputed values in IMPUT_EIP.xlsx
####    These imputations were created using tax unit level data on AGI and number of dependents internal to the BLS 
df_f_imp <- merge(df_f_imp, imput_eip, by.x="NEWID", by.y = "newid")


#### 8.3 Create imputed value for EIPI_t
df_f_imp <- df_f_imp %>% mutate(imp_eip1t = ifelse(EIPI_t == imp_eip1_1, imp_eip1_1,
                                                   ifelse(EIPI_t == imp_eip1_2, imp_eip1_2,
                                                          ifelse(EIPI_t == imp_eip1_3, imp_eip1_3,
                                                                 ifelse(EIPI_t == imp_eip1_4, imp_eip1_4,0)))))

df_f_imp <- df_f_imp %>% mutate(imp_eip1t = ifelse((YYMM == 2006 | YYMM == 2007) & EIPI_t == 0,
                                                   ifelse(INTERI == 1,imp_eip1_1,
                                                          ifelse(INTERI == 2,imp_eip1_2,
                                                                 ifelse(INTERI == 3,imp_eip1_3,
                                                                        ifelse(INTERI == 4,imp_eip1_4,-100)))),imp_eip1t))

df_f_imp <- df_f_imp %>% mutate(imp_eip1t = ifelse((YYMM == 2006 | YYMM == 2007) & EIPI_t > 0 & imp_eip1t == 0,
                                                   ifelse(INTERI == 1,imp_eip1_1,
                                                          ifelse(INTERI == 2,imp_eip1_2,
                                                                 ifelse(INTERI == 3,imp_eip1_3,
                                                                        ifelse(INTERI == 4,imp_eip1_4,-200)))),imp_eip1t))

#### 8.4 Create imputed value for lagged EIPI
df_f_imp <- df_f_imp %>% group_by(ID) %>% arrange(YYMM, .by_group = TRUE) %>%
  mutate(imp_eip1tm1 = lag(imp_eip1t, n=1, default = 0),
         imp_eip1tm2 = lag(imp_eip1t, n=2, default = 0))

#### 8.5 Assign lagged imputed EIPI value to CUs without observation in 2006 or 2007
df_f_imp <- df_f_imp %>% mutate(imp_eip1tm1 = ifelse((YYMM == 2009 | YYMM == 2010) & imp_eip1tm1 == 0,
                                                     ifelse(INTERI == 1, NA,
                                                            ifelse(INTERI == 2, imp_eip1_1,
                                                                   ifelse(INTERI == 3, imp_eip1_2,
                                                                          ifelse(INTERI == 4, imp_eip1_4,-300)))),imp_eip1tm1))


df_f_imp <- df_f_imp %>% mutate(imp_eip1tm2 = ifelse((YYMM == 2012 | YYMM == 2101) & imp_eip1tm2 == 0,
                                                     ifelse(INTERI == 1, NA,
                                                            ifelse(INTERI == 2, NA,
                                                                   ifelse(INTERI == 3, imp_eip1_1,
                                                                          ifelse(INTERI == 4, imp_eip1_2,-300)))),imp_eip1tm2))

#### 8.6 Create categorical representation of observed and imputed EIP value 

df_f_imp <- df_f_imp %>% mutate(eip1t_cat = ifelse(EIPI_t > 0 & EIPI_t < 1200,1,
                                                   ifelse(EIPI_t == 1200,2,
                                                          ifelse(EIPI_t > 1200 & EIPI_t < 2400,3,
                                                                 ifelse(EIPI_t == 2400,4,
                                                                        ifelse(EIPI_t > 2400 & EIPI_t < 3600,5,
                                                                               ifelse(EIPI_t == 3600,6,
                                                                                      ifelse(EIPI_t > 3600 & EIPI_t < 4200,7,
                                                                                             ifelse(EIPI_t == 4200,8,
                                                                                                    ifelse(EIPI_t > 4200,9,0))))))))))

df_f_imp <- df_f_imp %>% mutate(impeip1t_cat = ifelse(imp_eip1t > 0 & imp_eip1t < 1200,1,
                                                      ifelse(imp_eip1t == 1200,2,
                                                             ifelse(imp_eip1t > 1200 & imp_eip1t < 2400,3,
                                                                    ifelse(imp_eip1t == 2400,4,
                                                                           ifelse(imp_eip1t > 2400 & imp_eip1t < 3600,5,
                                                                                  ifelse(imp_eip1t == 3600,6,
                                                                                         ifelse(imp_eip1t > 3600 & imp_eip1t < 4200,7,
                                                                                                ifelse(imp_eip1t == 4200,8,
                                                                                                       ifelse(imp_eip1t > 4200,9,0))))))))))

df_f_imp <- df_f_imp %>% select(-c(imp_eip2_1,imp_eip2_2,imp_eip2_3,imp_eip2_4,imp_eip3_1,imp_eip3_2,imp_eip3_3,imp_eip3_4))
write.csv(df_f_imp,"df_f_imp.csv", row.names = FALSE)

# Keep only df, all CU sample, the final sample, and the final sample with imputations
rm(list=setdiff(ls(), c("df","df_all_cu","df_f","df_f_imp")))

